{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "zsh:1: command not found: nvidia-smi\n"
     ]
    }
   ],
   "source": [
    "# Print NVIDIA driver/GPU status via the shell (requires `nvidia-smi` on PATH).\n",
    "!nvidia-smi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "import time\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import torch\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, precision_score, recall_score\n",
    "from sklearn.model_selection import KFold\n",
    "from torch.utils.data import TensorDataset, DataLoader\n",
    "from torch.utils.data import RandomSampler, SequentialSampler\n",
    "from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaForSequenceClassification\n",
    "from transformers import get_linear_schedule_with_warmup, AdamW\n",
    "\n",
    "import helper_functions as hf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the labelled texts, balance the three classes, shuffle, and add an integer `label` column.\n",
    "\n",
    "# Class codes in the order they are numbered: F -> 0, T -> 1, U -> 2.\n",
    "LABEL_ORDER = [\"F\", \"T\", \"U\"]\n",
    "SAMPLES_PER_CLASS = 500\n",
    "SHUFFLE_SEED = 6  # fixed so the shuffled row order is identical after every kernel restart\n",
    "\n",
    "data = pd.read_excel(\"fake_news_covid.xlsx\")\n",
    "data = data[[\"label_text\", \"text\"]]\n",
    "\n",
    "# Keep at most the first SAMPLES_PER_CLASS rows of each label, concatenated in LABEL_ORDER.\n",
    "data = pd.concat([data[data.label_text == label][0:SAMPLES_PER_CLASS] for label in LABEL_ORDER])\n",
    "\n",
    "# Shuffle the rows; this cell runs before any global seed is set, so the seed is passed explicitly.\n",
    "data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)\n",
    "print(len(data))\n",
    "\n",
    "# Replace line breaks with spaces (literal match, not a regular expression).\n",
    "data[\"text\"] = data[\"text\"].str.replace('\\n', ' ', regex=False)\n",
    "\n",
    "# Pin the category order so the integer codes do not depend on which labels happen to be present.\n",
    "data[\"label\"] = data[\"label_text\"].astype(pd.CategoricalDtype(categories=LABEL_ORDER)).cat.codes\n",
    "print(data[\"label\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = RobertaTokenizerFast.from_pretrained(\"covid_roberta_40\", do_lower_case=True)\n",
    "\n",
    "# Pass 1: tokenize every text without truncation, only to inspect the token-length distribution.\n",
    "input_ids = [tokenizer.encode(text, add_special_tokens=True) for text in data['text']]\n",
    "lengths = [len(ids) for ids in input_ids]\n",
    "\n",
    "print('{:>10,} comments'.format(len(input_ids)))\n",
    "print('   Min length: {:,} tokens'.format(min(lengths)))\n",
    "print('   Max length: {:,} tokens'.format(max(lengths)))\n",
    "print('Median length: {:,} tokens'.format(np.median(lengths)))\n",
    "\n",
    "hf.plot_distribution(lengths)\n",
    "\n",
    "# Fixed sequence length used for padding/truncation below (alternative: max(lengths)).\n",
    "max_len = 160\n",
    "\n",
    "# Report how many texts are longer than the cut-off and will therefore be truncated.\n",
    "num_sentences = len(lengths)\n",
    "num_truncated = np.sum(np.greater(lengths, max_len))\n",
    "prcnt = float(num_truncated) / float(num_sentences)\n",
    "print('{:,} of {:,} sentences ({:.1%}) in the training set are longer than {:} tokens.'.format(num_truncated, num_sentences, prcnt, max_len))\n",
    "\n",
    "# Pass 2: encode each text to exactly max_len tokens (padded or truncated) as PyTorch tensors.\n",
    "encoded = [\n",
    "    tokenizer.encode_plus(text,\n",
    "                          max_length=max_len,\n",
    "                          padding='max_length',\n",
    "                          truncation=True,\n",
    "                          return_tensors='pt')\n",
    "    for text in data['text']\n",
    "]\n",
    "\n",
    "# Concatenate the per-text tensors along dim 0 into one tensor each for ids and masks.\n",
    "input_ids = torch.cat([enc['input_ids'] for enc in encoded], dim=0)\n",
    "attn_masks = torch.cat([enc['attention_mask'] for enc in encoded], dim=0)\n",
    "\n",
    "# Integer class labels, in the same row order as the encoded texts.\n",
    "labels = torch.tensor(data['label'].tolist())\n",
    "\n",
    "# Bundle ids, masks and labels so the data loaders can index them together.\n",
    "dataset = TensorDataset(input_ids, attn_masks, labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#########\n",
    "# Specify key model parameters here: \n",
    "# model_name: checkpoint identifier passed to RobertaForSequenceClassification.from_pretrained\n",
    "# lr:         learning rate handed to the AdamW optimizer\n",
    "# epochs:     upper bound on training epochs per fold\n",
    "# batch_size: batch size of both the training and the test data loaders\n",
    "model_name = \"covid_roberta_40\"\n",
    "lr = 3e-5\n",
    "epochs = 5\n",
    "batch_size = 32\n",
    "#########"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ======================================== #\n",
    "#              CV Training                 #\n",
    "# ======================================== #\n",
    "# Repeated k-fold cross-validation. Every fold fine-tunes a fresh copy of `model_name`,\n",
    "# is scored once on its held-out split after training, and after each run the fold\n",
    "# records plus an aggregate summary are written to timestamped JSON files.\n",
    "\n",
    "# Integer labels are the category codes of `label_text` in alphabetical order:\n",
    "# 0 = 'F' (fake), 1 = 'T' (true), 2 = 'U' (undetermined).\n",
    "CLASS_IDS = [0, 1, 2]\n",
    "FAKE, TRUE, UNDET = CLASS_IDS\n",
    "CLASS_PREFIXES = ['fake', 'true', 'undet']\n",
    "\n",
    "n_runs = 10\n",
    "k_folds = 10\n",
    "# Early stopping: from the third epoch on, stop once training accuracy exceeds the threshold.\n",
    "early_stop_acc = 0.875\n",
    "early_stop_min_epoch = 2\n",
    "\n",
    "\n",
    "def forward_batch(model, batch, device):\n",
    "    \"\"\"Move one (input_ids, attention_mask, labels) batch to `device` and run ONE forward pass.\n",
    "\n",
    "    Returns:\n",
    "        (loss, logits, labels): loss and logits taken from the same forward pass,\n",
    "        plus the label tensor on `device`.\n",
    "    \"\"\"\n",
    "    b_input_ids = batch[0].to(device)\n",
    "    b_input_mask = batch[1].to(device)\n",
    "    b_labels = batch[2].to(device)\n",
    "    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)\n",
    "    return outputs[0], outputs[1], b_labels\n",
    "\n",
    "\n",
    "def summarize_folds(fold_stats):\n",
    "    \"\"\"Collapse the per-fold records into one dict of means and standard deviations.\n",
    "\n",
    "    Each record stores 'f1', 'rec' and 'prec' as [per_class_list, macro_value].\n",
    "    \"\"\"\n",
    "    summary = {\n",
    "        'Model': model_name,\n",
    "        'lr': lr,\n",
    "        'epochs': epochs,\n",
    "        'batch_size': batch_size,\n",
    "        'tok': max_len,\n",
    "    }\n",
    "    # Per-class metrics: <prefix>_mean_f1, <prefix>_recall, <prefix>_prec (each with an _sd twin).\n",
    "    for class_id, prefix in zip(CLASS_IDS, CLASS_PREFIXES):\n",
    "        for name, key in (('mean_f1', 'f1'), ('recall', 'rec'), ('prec', 'prec')):\n",
    "            values = [x[key][0][class_id] for x in fold_stats]\n",
    "            summary[prefix + '_' + name] = np.mean(values)\n",
    "            summary[prefix + '_' + name + '_sd'] = np.std(values)\n",
    "    # Overall accuracy and macro-averaged metrics.\n",
    "    for name, values in (\n",
    "        ('overall_mean', [x['Test Accur.'] for x in fold_stats]),\n",
    "        ('overall_mean_f1', [x['f1'][1] for x in fold_stats]),\n",
    "        ('overall_recall', [x['rec'][1] for x in fold_stats]),\n",
    "        ('overall_prec', [x['prec'][1] for x in fold_stats]),\n",
    "    ):\n",
    "        summary[name] = np.mean(values)\n",
    "        summary[name + '_sd'] = np.std(values)\n",
    "    return summary\n",
    "\n",
    "\n",
    "seed_val = 6\n",
    "random.seed(seed_val)\n",
    "np.random.seed(seed_val)\n",
    "torch.manual_seed(seed_val)\n",
    "torch.cuda.manual_seed_all(seed_val)\n",
    "torch.cuda.empty_cache() #Clear GPU cache if necessary\n",
    "\n",
    "# Use the first GPU when one is available; otherwise fall back to the CPU instead of crashing.\n",
    "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
    "\n",
    "training_stats = [] # Not filled by this cell; kept so the name stays defined.\n",
    "fold_stats = [] # One record per fold. Accumulates over ALL runs, so saved files and summaries are cumulative.\n",
    "\n",
    "total_t0 = time.time() # Measure the total training time\n",
    "for run in range(0, n_runs):\n",
    "    # shuffle=True without random_state draws from the global NumPy RNG seeded above.\n",
    "    kfold = KFold(n_splits=k_folds, shuffle=True)\n",
    "\n",
    "    timestamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H%M%S')\n",
    "\n",
    "    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):\n",
    "\n",
    "        print(f'FOLD {fold+1}')\n",
    "        print('--------------------------------')\n",
    "\n",
    "        # Sample elements randomly from a given list of ids, no replacement.\n",
    "        train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)\n",
    "        test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)\n",
    "\n",
    "        # Data loaders for the training and test split of this fold.\n",
    "        train_dataloader = torch.utils.data.DataLoader(\n",
    "                        dataset,\n",
    "                        batch_size=batch_size, sampler=train_subsampler)\n",
    "        test_dataloader = torch.utils.data.DataLoader(\n",
    "                        dataset,\n",
    "                        batch_size=batch_size, sampler=test_subsampler)\n",
    "\n",
    "        # Fresh model, optimizer and schedule for each fold.\n",
    "        model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)\n",
    "        model.to(device)\n",
    "        optimizer = AdamW(model.parameters(), lr = lr, eps = 1e-6)\n",
    "        # One scheduler step is taken per training batch, so the schedule length is the\n",
    "        # number of batches in the TRAINING split (not the full dataset) times the epochs.\n",
    "        total_steps = len(train_dataloader) * epochs\n",
    "        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 10, num_training_steps = total_steps)\n",
    "\n",
    "        # Run the training loop for at most `epochs` epochs.\n",
    "        for epoch_i in range(0, epochs):\n",
    "            print(\"\")\n",
    "            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))\n",
    "            print('Training...')\n",
    "            t0 = time.time()\n",
    "            total_train_loss = 0 # Reset the total loss for this epoch.\n",
    "            model.train() # Put the model into training mode.\n",
    "            update_interval = hf.good_update_interval( # Pick an interval on which to print progress updates.\n",
    "                        total_iters = len(train_dataloader),\n",
    "                        num_desired_updates = 10\n",
    "                    )\n",
    "\n",
    "            predictions_t, true_labels_t = [], []\n",
    "            for step, batch in enumerate(train_dataloader):\n",
    "                if (step % update_interval) == 0 and not step == 0:\n",
    "                    elapsed = hf.format_time(time.time() - t0)\n",
    "                    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed), end='\\r')\n",
    "                # Always clear any previously calculated gradients before performing a backward pass.\n",
    "                model.zero_grad()\n",
    "                # Single forward pass: the loss and the logits come from the same computation.\n",
    "                loss, logits, b_labels = forward_batch(model, batch, device)\n",
    "                # Accumulate the training loss so the epoch average can be reported.\n",
    "                total_train_loss += loss.item()\n",
    "                loss.backward()\n",
    "                # Clip the gradient norm to 1.0 to guard against exploding gradients.\n",
    "                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n",
    "                optimizer.step()\n",
    "                # Update the learning rate.\n",
    "                scheduler.step()\n",
    "\n",
    "                # Store predictions and true labels for the training accuracy.\n",
    "                predictions_t.append(logits.detach().cpu().numpy())\n",
    "                true_labels_t.append(b_labels.to('cpu').numpy())\n",
    "\n",
    "            # Combine the results across all batches.\n",
    "            flat_predictions_t = np.concatenate(predictions_t, axis=0)\n",
    "            flat_true_labels_t = np.concatenate(true_labels_t, axis=0)\n",
    "            # For each sample, pick the class (0, 1 or 2) with the highest score.\n",
    "            predicted_labels_t = np.argmax(flat_predictions_t, axis=1)\n",
    "            acc_t = accuracy_score(flat_true_labels_t, predicted_labels_t)\n",
    "\n",
    "            # Calculate the average loss over all of the batches.\n",
    "            avg_train_loss = total_train_loss / len(train_dataloader)\n",
    "\n",
    "            # Measure how long this epoch took.\n",
    "            training_time = hf.format_time(time.time() - t0)\n",
    "\n",
    "            print(\"\")\n",
    "            print(\"  Average training loss: {0:.3f}\".format(avg_train_loss))\n",
    "            print(\"  Training epoch took: {:}\".format(training_time))\n",
    "            print(\"  Training accuracy: {:.3f}\".format(acc_t))\n",
    "\n",
    "            if acc_t > early_stop_acc and epoch_i >= early_stop_min_epoch:\n",
    "                break\n",
    "\n",
    "        # TEST\n",
    "        # Once training for this fold has finished, measure performance on the held-out split.\n",
    "\n",
    "        print(\"\")\n",
    "        print(\"Running test...\")\n",
    "        t0 = time.time()\n",
    "        model.eval() # Put the model in evaluation mode--the dropout layers behave differently during evaluation.\n",
    "        total_eval_loss = 0\n",
    "        predictions, true_labels = [], []\n",
    "        for batch in test_dataloader:\n",
    "            with torch.no_grad():\n",
    "                # Single forward pass, no gradient tracking.\n",
    "                loss, logits, b_labels = forward_batch(model, batch, device)\n",
    "            # Accumulate the test loss.\n",
    "            total_eval_loss += loss.item()\n",
    "            # Store predictions and true labels on the CPU.\n",
    "            predictions.append(logits.detach().cpu().numpy())\n",
    "            true_labels.append(b_labels.to('cpu').numpy())\n",
    "\n",
    "        # Combine the results across all batches.\n",
    "        flat_predictions = np.concatenate(predictions, axis=0)\n",
    "        flat_true_labels = np.concatenate(true_labels, axis=0)\n",
    "        # For each sample, pick the class (0, 1 or 2) with the highest score.\n",
    "        predicted_labels = np.argmax(flat_predictions, axis=1)\n",
    "\n",
    "        # Calculate the average loss over all of the batches.\n",
    "        avg_val_loss = total_eval_loss / len(test_dataloader)\n",
    "\n",
    "        # scikit-learn metrics take (y_true, y_pred); swapping them exchanges precision and recall.\n",
    "        ov_acc = [\n",
    "            accuracy_score(flat_true_labels, predicted_labels),\n",
    "            recall_score(flat_true_labels, predicted_labels, average=\"macro\"),\n",
    "            precision_score(flat_true_labels, predicted_labels, average=\"macro\"),\n",
    "            f1_score(flat_true_labels, predicted_labels, average=\"macro\"),\n",
    "        ]\n",
    "        # Per-class scores; explicit labels keep list position == class id even if a class is absent.\n",
    "        f1 = f1_score(flat_true_labels, predicted_labels, average=None, labels=CLASS_IDS).tolist()\n",
    "        prec = precision_score(flat_true_labels, predicted_labels, average=None, labels=CLASS_IDS).tolist()\n",
    "        rec = recall_score(flat_true_labels, predicted_labels, average=None, labels=CLASS_IDS).tolist()\n",
    "\n",
    "        # Report the final scores for this test run.\n",
    "        print(\"  True F1: {0:.3f}\".format(f1[TRUE]))\n",
    "        print(\"  Fake F1: {0:.3f}\".format(f1[FAKE]))\n",
    "        print(\"  Undetermined F1: {0:.3f}\".format(f1[UNDET]))\n",
    "        print('RoBERTa-Covid-40 Prediction F1: {:.3f}'.format(ov_acc[3]))\n",
    "\n",
    "        # Measure how long the test run took.\n",
    "        test_time = hf.format_time(time.time() - t0)\n",
    "        print(\"  Test Loss: {0:.3f}\".format(avg_val_loss))\n",
    "        print(\"  Test took: {:}\".format(test_time))\n",
    "\n",
    "        fold_stats.append(\n",
    "            {\n",
    "                'run': run+1,\n",
    "                'fold': fold+1,\n",
    "                'Training Loss': avg_train_loss,\n",
    "                'Test Loss': avg_val_loss,\n",
    "                'Test Accur.': ov_acc[0],\n",
    "                'True F1': f1[TRUE],\n",
    "                'Fake F1': f1[FAKE],\n",
    "                'Undetermined F1': f1[UNDET],\n",
    "                'f1': [f1, ov_acc[3]],\n",
    "                'prec': [prec, ov_acc[2]],\n",
    "                'rec': [rec, ov_acc[1]]\n",
    "            }\n",
    "        )\n",
    "\n",
    "        print(\"\")\n",
    "        print(\"  Running Fake F1 mean: {0:.3f}\".format(np.mean([x[\"Fake F1\"] for x in fold_stats])))\n",
    "        print(\"  Running True F1 mean: {0:.3f}\".format(np.mean([x[\"True F1\"] for x in fold_stats])))\n",
    "        print(\"  Running Undet. F1 mean: {0:.3f}\".format(np.mean([x[\"Undetermined F1\"] for x in fold_stats])))\n",
    "        print(\"\")\n",
    "\n",
    "    # Summary over every fold recorded so far (this run and all earlier runs).\n",
    "    fakenews_stats = [summarize_folds(fold_stats)]\n",
    "\n",
    "    with open('fold_stats_roberta_pt_40_' + timestamp + '.txt', 'w') as outfile:\n",
    "        json.dump(fold_stats, outfile)\n",
    "    with open('fakenews_results_roberta_pt_40_' + timestamp + '.txt', 'w') as outfile:\n",
    "        json.dump(fakenews_stats, outfile)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.8.10 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "vscode": {
   "interpreter": {
    "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
